{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 冰面滑行 FrozenLake-v0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "np.random.seed(0)\n",
    "import gym"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 环境使用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "观察空间 = Discrete(16)\n",
      "动作空间 = Discrete(4)\n",
      "观测空间大小 = 16\n",
      "动作空间大小 = 4\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[(0.3333333333333333, 14, 0.0, False),\n",
       " (0.3333333333333333, 15, 1.0, True),\n",
       " (0.3333333333333333, 10, 0.0, False)]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "env = gym.make('FrozenLake-v0')\n",
    "env.seed(0)  # seed the environment's random number generator\n",
    "# Report both spaces and their sizes, one line each.\n",
    "print(\n",
    "    '观察空间 = {}'.format(env.observation_space),\n",
    "    '动作空间 = {}'.format(env.action_space),\n",
    "    '观测空间大小 = {}'.format(env.unwrapped.nS),\n",
    "    '动作空间大小 = {}'.format(env.unwrapped.nA),\n",
    "    sep='\\n')\n",
    "# Dynamics of state 14 under action 2, displayed as the cell's result.\n",
    "env.unwrapped.P[14][2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "用随机策略玩"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def play_policy(env, policy, render=False):\n",
    "    total_reward = 0.\n",
    "    observation = env.reset()\n",
    "    while True:\n",
    "        if render:\n",
    "            env.render() # 此行可显示\n",
    "        action = np.random.choice(env.action_space.n,\n",
    "                p=policy[observation])\n",
    "        observation, reward, done, _ = env.step(action)\n",
    "        total_reward += reward  # 统计回合奖励\n",
    "        if done: # 游戏结束\n",
    "            break\n",
    "    return total_reward"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "随机策略 平均奖励：0.04\n"
     ]
    }
   ],
   "source": [
    "# Uniform random policy: every action is equally likely in every state.\n",
    "random_policy = np.full(\n",
    "    (env.unwrapped.nS, env.unwrapped.nA), 1. / env.unwrapped.nA)\n",
    "\n",
    "# Average the return of 100 episodes played with that policy.\n",
    "episode_rewards = [\n",
    "    play_policy(env, random_policy)\n",
    "    for _ in range(100)\n",
    "]\n",
    "print(\"随机策略 平均奖励：{}\".format(np.mean(episode_rewards)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 策略评估"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def v2q(env, v, s=None, gamma=1.): # 根据状态价值函数计算动作价值函数\n",
    "    if s is not None: # 针对单个状态求解\n",
    "        q = np.zeros(env.unwrapped.nA)\n",
    "        for a in range(env.unwrapped.nA):\n",
    "            for prob, next_state, reward, done in env.unwrapped.P[s][a]:\n",
    "                q[a] += prob * \\\n",
    "                        (reward + gamma * v[next_state] * (1. - done))\n",
    "    else: # 针对所有状态求解\n",
    "        q = np.zeros((env.unwrapped.nS, env.unwrapped.nA))\n",
    "        for s in range(env.unwrapped.nS):\n",
    "            q[s] = v2q(env, v, s, gamma)\n",
    "    return q\n",
    "\n",
    "def evaluate_policy(env, policy, gamma=1., tolerant=1e-6):\n",
    "    \"\"\"Iteratively compute the state values of ``policy``.\n",
    "\n",
    "    Sweeps over all states, replacing each value with the policy-weighted\n",
    "    sum of its action values, until the largest change seen in a sweep\n",
    "    is below ``tolerant``.\n",
    "\n",
    "    Args:\n",
    "        env: environment whose ``unwrapped`` object exposes ``nS``.\n",
    "        policy: table with one row of action probabilities per state.\n",
    "        gamma: discount factor forwarded to ``v2q``.\n",
    "        tolerant: convergence threshold on the largest per-sweep change.\n",
    "\n",
    "    Returns:\n",
    "        Array of length ``nS`` holding the estimated state values.\n",
    "    \"\"\"\n",
    "    n_states = env.unwrapped.nS\n",
    "    values = np.zeros(n_states)  # start from all-zero state values\n",
    "    converged = False\n",
    "    while not converged:\n",
    "        largest_change = 0\n",
    "        for state in range(n_states):\n",
    "            action_values = v2q(env, values, state, gamma)\n",
    "            updated = sum(policy[state] * action_values)\n",
    "            largest_change = max(largest_change,\n",
    "                                 abs(values[state] - updated))\n",
    "            # Written in place, so later states in this sweep see it.\n",
    "            values[state] = updated\n",
    "        converged = largest_change < tolerant\n",
    "    return values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "评估随机策略的价值函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "状态价值函数：\n",
      "[[0.0139372  0.01162942 0.02095187 0.01047569]\n",
      " [0.01624741 0.         0.04075119 0.        ]\n",
      " [0.03480561 0.08816967 0.14205297 0.        ]\n",
      " [0.         0.17582021 0.43929104 0.        ]]\n",
      "动作价值函数：\n",
      "[[0.01470727 0.01393801 0.01393801 0.01316794]\n",
      " [0.00852221 0.01162969 0.01086043 0.01550616]\n",
      " [0.02444416 0.0209521  0.02405958 0.01435233]\n",
      " [0.01047585 0.01047585 0.00698379 0.01396775]\n",
      " [0.02166341 0.01701767 0.0162476  0.01006154]\n",
      " [0.         0.         0.         0.        ]\n",
      " [0.05433495 0.04735099 0.05433495 0.00698396]\n",
      " [0.         0.         0.         0.        ]\n",
      " [0.01701767 0.04099176 0.03480569 0.04640756]\n",
      " [0.0702086  0.11755959 0.10595772 0.05895286]\n",
      " [0.18940397 0.17582024 0.16001408 0.04297362]\n",
      " [0.         0.         0.         0.        ]\n",
      " [0.         0.         0.         0.        ]\n",
      " [0.08799662 0.20503708 0.23442697 0.17582024]\n",
      " [0.25238807 0.53837042 0.52711467 0.43929106]\n",
      " [0.         0.         0.         0.        ]]\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the uniform random policy, then derive its action values.\n",
    "v_random = evaluate_policy(env, random_policy)\n",
    "q_random = v2q(env, v_random)\n",
    "\n",
    "print('状态价值函数：')\n",
    "print(v_random.reshape(4, 4))  # show the 16 state values as a 4x4 array\n",
    "print('动作价值函数：')\n",
    "print(q_random)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 策略改进"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def improve_policy(env, v, policy, gamma=1.):\n",
    "    \"\"\"Make ``policy`` greedy with respect to ``v``, editing it in place.\n",
    "\n",
    "    Args:\n",
    "        env: environment whose ``unwrapped`` object exposes ``nS``.\n",
    "        v: state values used to rank the actions of each state.\n",
    "        policy: table of action probabilities; rows are overwritten with\n",
    "            a one-hot row wherever the greedy action lacks probability 1.\n",
    "        gamma: discount factor forwarded to ``v2q``.\n",
    "\n",
    "    Returns:\n",
    "        True if no row had to change, False otherwise.\n",
    "    \"\"\"\n",
    "    already_greedy = True\n",
    "    for state in range(env.unwrapped.nS):\n",
    "        greedy_action = np.argmax(v2q(env, v, state, gamma))\n",
    "        if policy[state][greedy_action] == 1.:\n",
    "            continue  # this state already commits to its greedy action\n",
    "        already_greedy = False\n",
    "        policy[state] = 0.\n",
    "        policy[state][greedy_action] = 1.\n",
    "    return already_greedy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "对随机策略进行改进"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "有更新，更新后的策略为：\n",
      "[[1. 0. 0. 0.]\n",
      " [0. 0. 0. 1.]\n",
      " [1. 0. 0. 0.]\n",
      " [0. 0. 0. 1.]\n",
      " [1. 0. 0. 0.]\n",
      " [1. 0. 0. 0.]\n",
      " [1. 0. 0. 0.]\n",
      " [1. 0. 0. 0.]\n",
      " [0. 0. 0. 1.]\n",
      " [0. 1. 0. 0.]\n",
      " [1. 0. 0. 0.]\n",
      " [1. 0. 0. 0.]\n",
      " [1. 0. 0. 0.]\n",
      " [0. 0. 1. 0.]\n",
      " [0. 1. 0. 0.]\n",
      " [1. 0. 0. 0.]]\n"
     ]
    }
   ],
   "source": [
    "# Work on a copy: improve_policy edits the policy it is given in place.\n",
    "policy = random_policy.copy()\n",
    "optimal = improve_policy(env, v_random, policy)\n",
    "print('无更新，最优策略为：' if optimal else '有更新，更新后的策略为：')\n",
    "print(policy)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 策略迭代"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "状态价值函数 =\n",
      "[[0.82351246 0.82350689 0.82350303 0.82350106]\n",
      " [0.82351416 0.         0.5294002  0.        ]\n",
      " [0.82351683 0.82352026 0.76469786 0.        ]\n",
      " [0.         0.88234658 0.94117323 0.        ]]\n",
      "最优策略 =\n",
      "[[0 3 3 3]\n",
      " [0 0 0 0]\n",
      " [3 1 0 0]\n",
      " [0 2 1 0]]\n"
     ]
    }
   ],
   "source": [
    "def iterate_policy(env, gamma=1., tolerant=1e-6):\n",
    "    \"\"\"Alternate policy evaluation and greedy improvement until stable.\n",
    "\n",
    "    Args:\n",
    "        env: environment whose ``unwrapped`` object exposes ``nS``/``nA``.\n",
    "        gamma: discount factor, used for both evaluation and improvement.\n",
    "        tolerant: convergence threshold passed to ``evaluate_policy``.\n",
    "\n",
    "    Returns:\n",
    "        ``(policy, v)``: the final policy table and the state values\n",
    "        computed for it in the last evaluation.\n",
    "    \"\"\"\n",
    "    # Start from the uniform random policy; any initial policy would do.\n",
    "    policy = (np.ones((env.unwrapped.nS, env.unwrapped.nA))\n",
    "              / env.unwrapped.nA)\n",
    "    while True:\n",
    "        v = evaluate_policy(env, policy, gamma, tolerant)  # evaluation\n",
    "        # Improve with the same discount that was used for evaluation;\n",
    "        # omitting gamma here would silently fall back to gamma=1.\n",
    "        if improve_policy(env, v, policy, gamma):\n",
    "            break\n",
    "    return policy, v\n",
    "\n",
    "policy_pi, v_pi = iterate_policy(env)\n",
    "# Show the state values, then the chosen action index of each state,\n",
    "# both as 4x4 arrays.\n",
    "print('状态价值函数 =', v_pi.reshape(4, 4), sep='\\n')\n",
    "print('最优策略 =', policy_pi.argmax(axis=1).reshape(4, 4), sep='\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "测试策略"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "策略迭代 平均奖励：0.77\n"
     ]
    }
   ],
   "source": [
    "# Average the return of 100 episodes played with the policy-iteration result.\n",
    "episode_rewards = [\n",
    "    play_policy(env, policy_pi)\n",
    "    for _ in range(100)\n",
    "]\n",
    "print(\"策略迭代 平均奖励：{}\".format(np.mean(episode_rewards)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 价值迭代"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "状态价值函数 =\n",
      "[[0.82351232 0.82350671 0.82350281 0.82350083]\n",
      " [0.82351404 0.         0.52940011 0.        ]\n",
      " [0.82351673 0.82352018 0.76469779 0.        ]\n",
      " [0.         0.88234653 0.94117321 0.        ]]\n",
      "最优策略 =\n",
      "[[0 3 3 3]\n",
      " [0 0 0 0]\n",
      " [3 1 0 0]\n",
      " [0 2 1 0]]\n"
     ]
    }
   ],
   "source": [
    "def iterate_value(env, gamma=1, tolerant=1e-6):\n",
    "    \"\"\"Run value iteration and read off the greedy policy.\n",
    "\n",
    "    Each sweep replaces every state value with the largest of its action\n",
    "    values; sweeps stop once the largest change is below ``tolerant``.\n",
    "\n",
    "    Args:\n",
    "        env: environment whose ``unwrapped`` object exposes ``nS``/``nA``.\n",
    "        gamma: discount factor forwarded to ``v2q``.\n",
    "        tolerant: convergence threshold on the largest per-sweep change.\n",
    "\n",
    "    Returns:\n",
    "        ``(policy, v)``: a one-hot policy table of shape ``(nS, nA)`` and\n",
    "        the converged state values.\n",
    "    \"\"\"\n",
    "    n_states, n_actions = env.unwrapped.nS, env.unwrapped.nA\n",
    "    values = np.zeros(n_states)  # start from all-zero state values\n",
    "    converged = False\n",
    "    while not converged:\n",
    "        largest_change = 0\n",
    "        for state in range(n_states):\n",
    "            best = max(v2q(env, values, state, gamma))\n",
    "            largest_change = max(largest_change, abs(values[state] - best))\n",
    "            values[state] = best\n",
    "        converged = largest_change < tolerant\n",
    "\n",
    "    # Greedy policy: probability 1 on the best action of every state.\n",
    "    greedy = np.zeros((n_states, n_actions))\n",
    "    for state in range(n_states):\n",
    "        best_action = np.argmax(v2q(env, values, state, gamma))\n",
    "        greedy[state][best_action] = 1.\n",
    "    return greedy, values\n",
    "\n",
    "policy_vi, v_vi = iterate_value(env)\n",
    "# Show the state values, then the chosen action index of each state,\n",
    "# both as 4x4 arrays.\n",
    "print('状态价值函数 =', v_vi.reshape(4, 4), sep='\\n')\n",
    "print('最优策略 =', policy_vi.argmax(axis=1).reshape(4, 4), sep='\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "测试策略"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "价值迭代 平均奖励：0.7\n"
     ]
    }
   ],
   "source": [
    "# Average the return of 100 episodes played with the value-iteration result.\n",
    "episode_rewards = [\n",
    "    play_policy(env, policy_vi)\n",
    "    for _ in range(100)\n",
    "]\n",
    "print(\"价值迭代 平均奖励：{}\".format(np.mean(episode_rewards)))"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
